banner

Predict whether an aircraft strike with wildlife causes damage

Kaggle link

Twitch link

This time I’m trying an XGBoost model.

Load libraries

library(tidyverse)
library(tidymodels)
library(scales)
library(skimr)
library(kableExtra)
library(patchwork)
library(gt)
library(lubridate)
library(glue)
library(themis)
library(credentials)

My settings

# colours
#pal_jo <- c(viridisLite::magma(8)[2:8], "#4C4C53", "#9B9BA8")

pal_sliced <- c(
  '#e946a5', # pink
  '#68e199', # green
  '#302df0', # blue
  '#6a45b0', # purple
  '#129875', # darker green
  '#4f94bf', # lighter blue
  '#b545ab', # darker pink
  '#000000', # black,
  '#cccccc'  #grey
)

title_colour <- pal_sliced[1]
table_colour <- pal_sliced[4]
bar_colour <- pal_sliced[2]
grid_colour <- pal_sliced[9]

theme_set(theme_bw() %+replace%
    theme(
      # align title and caption to the plot not the panel
      plot.title.position = 'plot',
      plot.caption.position = 'plot',
      # change the title and caption to markdown and move them futher from the plot
      plot.title = element_text(
        size = rel(1.3),
        hjust = 0, 
        margin = margin(c(0, 0, 10, 0)),
        colour = title_colour
      ),
      plot.subtitle = element_text(
        size = rel(1.15),
        hjust = 0, 
        margin = margin(c(0, 0, 15, 0))
      ),
      plot.caption = element_text(
        hjust = 1, 
        margin = margin(c(10, 0, 0, 0))
      ),
      # move axis titles to the left/top and change them to markdown
      axis.title = element_text(hjust = 1),
      # allow the axis values to the markdown as well
      axis.text = element_text(),
      # remove the panel border
      panel.border = element_blank(),
      # put in the axis lines with a slightly thicker line than the gridlines
      axis.line = element_line(colour = grid_colour, size = rel(1.5)),
      # make the tickmarks the same colour
      axis.ticks = element_line(colour = grid_colour),
      # facet strip text left aligned with extra space above
      strip.text = element_text(
        hjust = 0, margin = margin(c(10, 0, 0, 0)), colour = title_colour
      ),
      # clear colour and fill for strip
      strip.background = element_rect(colour = NA, fill = NA),
      # dotted gridlines
      panel.grid = element_line(linetype = 'dotted'),
      # ability to use a different colour for the gridlines
      panel.grid.major.x = element_line(colour = grid_colour),
      panel.grid.major.y = element_line(colour = grid_colour),
      panel.grid.minor.x = element_blank(),
      panel.grid.minor.y = element_blank(),
    )
)

scale_y_pct <- function(
  accuracy = 1L, 
  breaks = pretty_breaks(),
  expand = expansion(mult = c(0, .05)),
  ...
) {
  scale_y_continuous(
    labels = scales::percent_format(accuracy = accuracy, big.mark = ","),
    breaks = breaks,
    expand = expand,
    ...
  )
}

scale_y_comma <- function(
  accuracy = 1L, 
  breaks = pretty_breaks(),
  expand = expansion(mult = c(0, .05)),
  ...
) {
  scale_y_continuous(
    labels = scales::comma_format(accuracy = accuracy),
    breaks = breaks,
    expand = expand,
    ...
  )
}

scale_fill_jo <- function(...) {
  scale_fill_manual(values = pal_sliced, ...)
}

scale_fill_discrete <- scale_fill_jo

update_geom_defaults("bar", list(fill = bar_colour))
update_geom_defaults("col", list(fill = bar_colour))
update_geom_defaults("point", list(colour = bar_colour))
update_geom_defaults("line", list(colour = bar_colour))

Read files

to_build <- read_csv("s01e02/train.csv", guess_max = 200000) 

── Column specification ────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  id = col_double(),
  incident_year = col_double(),
  incident_month = col_double(),
  incident_day = col_double(),
  aircraft_model = col_double(),
  aircraft_mass = col_double(),
  engine_make = col_double(),
  engines = col_double(),
  engine1_position = col_double(),
  engine2_position = col_double(),
  engine4_position = col_double(),
  height = col_double(),
  speed = col_double(),
  distance = col_double(),
  damaged = col_double()
)
ℹ Use `spec()` for the full column specifications.
to_score <- read_csv("s01e02/test.csv", guess_max = 200000) 

── Column specification ────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  id = col_double(),
  incident_year = col_double(),
  incident_month = col_double(),
  incident_day = col_double(),
  aircraft_model = col_double(),
  aircraft_mass = col_double(),
  engine_make = col_double(),
  engines = col_double(),
  engine1_position = col_double(),
  engine2_position = col_double(),
  engine4_position = col_double(),
  height = col_double(),
  speed = col_double(),
  distance = col_double()
)
ℹ Use `spec()` for the full column specifications.

Examine data

skim(to_build)
Data summary
Name to_build
Number of rows 21000
Number of columns 34
_______________________
Column type frequency:
character 19
numeric 15
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
operator_id 0 1.00 3 5 0 276 0
operator 0 1.00 3 33 0 275 0
aircraft 0 1.00 3 20 0 424 0
aircraft_type 4992 0.76 1 1 0 2 0
aircraft_make 5231 0.75 2 3 0 62 0
engine_model 6334 0.70 1 2 0 39 0
engine_type 5703 0.73 1 3 0 8 0
engine3_position 19671 0.06 1 11 0 4 0
airport_id 0 1.00 3 5 0 1039 0
airport 34 1.00 4 53 0 1038 0
state 2664 0.87 2 2 0 60 0
faa_region 2266 0.89 3 3 0 14 0
flight_phase 6728 0.68 4 12 0 12 0
visibility 7699 0.63 3 7 0 5 0
precipitation 10327 0.51 3 15 0 8 0
species_id 0 1.00 1 6 0 447 0
species_name 7 1.00 4 50 0 445 0
species_quantity 532 0.97 1 8 0 4 0
flight_impact 8944 0.57 4 21 0 6 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 14980.94 8663.24 1 7458.75 14978.5 22472.25 30000 ▇▇▇▇▇
incident_year 0 1.00 2006.06 6.72 1990 2001.00 2007.0 2012.00 2015 ▂▃▅▆▇
incident_month 0 1.00 7.19 2.79 1 5.00 8.0 9.00 12 ▃▅▆▇▆
incident_day 0 1.00 15.63 8.82 1 8.00 15.0 23.00 31 ▇▇▇▇▆
aircraft_model 6259 0.70 24.65 21.70 0 10.00 22.0 37.00 98 ▇▆▂▁▁
aircraft_mass 5694 0.73 3.50 0.89 1 3.00 4.0 4.00 5 ▁▁▂▇▁
engine_make 6155 0.71 21.22 11.04 1 10.00 22.0 34.00 47 ▇▂▆▇▁
engines 5696 0.73 2.05 0.46 1 2.00 2.0 2.00 4 ▁▇▁▁▁
engine1_position 5838 0.72 2.99 2.09 1 1.00 1.0 5.00 7 ▇▁▂▅▁
engine2_position 6776 0.68 2.91 2.01 1 1.00 1.0 5.00 7 ▇▁▂▅▁
engine4_position 20650 0.02 2.02 1.43 1 1.00 1.0 4.00 5 ▇▁▁▃▁
height 8469 0.60 819.24 1772.53 0 0.00 50.0 800.00 24000 ▇▁▁▁▁
speed 12358 0.41 141.39 52.25 0 120.00 137.0 160.00 2500 ▇▁▁▁▁
distance 8913 0.58 0.66 3.33 0 0.00 0.0 0.00 100 ▇▁▁▁▁
damaged 0 1.00 0.09 0.28 0 0.00 0.0 0.00 1 ▇▁▁▁▁


Initial thoughts

  • Lots of missing values!
  • Some of the character variables have lots of categories
  • Missing value for the target variable
explore_build <- to_build %>% 
  filter(! is.na(damaged)) %>% 
  mutate(
    damaged_factor = factor(if_else(damaged == 1, 'Damaged', 'Not Damaged')),
    date = ymd(paste(incident_year, incident_month, incident_day, 
      sep = '-')),
    dow = wday(date, label = TRUE, abbr = TRUE),
    log_height= log(height),
    log_speed= log(speed),
    log_distance = log(distance),
    aircraft_mass = factor(aircraft_mass)
  )

Look at sample

set.seed(617)

explore_build %>% 
  select(damaged, damaged_factor, everything()) %>% 
  slice_sample(n = 5) %>% 
  gather(-id, key = 'key', value = 'value') %>% 
  spread(key = id, value = value) %>% 
  arrange(-str_detect(key, 'damag')) %>% 
  gt()
key 8896 17349 20384 21944 23470
damaged 0 0 0 0 0
damaged_factor Not Damaged Not Damaged Not Damaged Not Damaged Not Damaged
aircraft DHC8 DASH 8 B-767-200 UNKNOWN UNKNOWN C-560
aircraft_make 303 148 NA NA 226
aircraft_mass 3 4 NA NA 1
aircraft_model 10 29 NA NA 45
aircraft_type A A NA NA A
airport BRADLEY INTL UNKNOWN GEORGE BUSH INTERCONTINENTAL/ HOUSTON ARPT YOUNGSTOWN-WARREN RGNL ARPT UNKNOWN
airport_id KBDL ZZZZ KIAH KYNG ZZZZ
date 15836 13671 14762 14419 12324
distance 2 NA 0 NA NA
dow Sat Thu Wed Wed Mon
engine_make 31 22 NA NA 31
engine_model 10 7 NA NA 1
engine_type C D NA NA D
engine1_position 4 1 NA NA 5
engine2_position 4 1 NA NA 5
engine3_position NA NA NA NA NA
engine4_position NA NA NA NA NA
engines 2 2 NA NA 2
faa_region ANE NA ASW AGL NA
flight_impact NONE NONE NA NA NA
flight_phase APPROACH NA NA TAKEOFF RUN NA
height 800 NA NA NA NA
incident_day 11 7 2 24 29
incident_month 5 6 6 6 9
incident_year 2013 2007 2010 2009 2003
log_distance 0.693147180559945 NA -Inf NA NA
log_height 6.68461172766793 NA NA NA NA
log_speed 4.74493212836325 NA NA 4.97673374242057 NA
operator COMMUTAIR ABX AIR UNKNOWN MILITARY NETJETS
operator_id UCA ABX UNK MIL NJA
precipitation NONE NA NA NA NA
species_id UNKBS UNKBM I1301 UNKBM UNKBM
species_name UNKNOWN SMALL BIRD UNKNOWN MEDIUM BIRD CATTLE EGRET UNKNOWN MEDIUM BIRD UNKNOWN MEDIUM BIRD
species_quantity 1 1 1 NA 1
speed 115 NA NA 145 NA
state CT NA TX OH NA
visibility NIGHT NA NA DAY NA

Target - damaged

  • unbalanced
  • was a missing value
summary(explore_build$damaged)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.00000 0.00000 0.08567 0.00000 1.00000 
ggplot(explore_build, aes(damaged_factor, fill = damaged_factor)) +
  geom_bar() +
  scale_y_comma() +
  labs(title = 'Damaged', x = NULL, y = NULL, fill = 'Damaged') +
  guides(fill = 'none')

Function for exploration - numeric

explore_num <- function(x, ...){

a <- explore_build %>% 
  mutate(is_it_na = if_else(is.na({{x}}), 'NA', 'Present')) %>% 
ggplot(aes(is_it_na, fill = damaged_factor)) +
  geom_bar() +
  scale_y_comma() +
  labs(x = NULL, y = NULL, fill = 'Damaged') +
  guides(fill = 'none')

b <- explore_build %>% 
  mutate(is_it_na = if_else(is.na({{x}}), 'NA', 'Present')) %>% 
ggplot(aes(is_it_na, fill = damaged_factor)) +
  geom_bar(position = 'fill') +
  scale_y_pct() +
  labs(x = NULL, y = NULL, fill = 'Damaged') +
  guides(fill = 'none')

c <- ggplot(explore_build, aes({{x}}, fill = damaged_factor)) +
  geom_histogram( ...) +
  scale_y_comma() +
  labs(x = NULL, y = NULL, fill = 'Damaged') +
  guides(fill = 'none')

d <- ggplot(explore_build, aes({{x}}, fill = damaged_factor)) +
  geom_histogram(position = 'fill', ...) +
  scale_y_pct() +
  labs(x = NULL, y = NULL, fill = 'Damaged') +
  guides(fill = 'none')

(a + b) / (c + d) 

}

Function for exploration - category

explore_char <- function(x, ...){

c <- explore_build %>% 
  mutate(
    tmp = fct_infreq(fct_lump_n({{x}}, 20))
  ) %>% 
ggplot(aes(tmp, fill = damaged_factor)) +
  geom_bar() +
  scale_y_comma() +
  scale_x_discrete(label = label_wrap(15)) +
  labs(x = NULL, y = NULL, fill = 'Damaged') +
  coord_flip() +
  guides(fill = 'none')

d <- explore_build %>% 
  mutate(
    tmp = fct_infreq(fct_lump_n({{x}}, 20))
  ) %>% 
ggplot(aes(tmp, fill = damaged_factor)) +
  geom_bar(position = 'fill') +
  scale_y_pct() +
  scale_x_discrete(label = label_wrap(15)) +
  labs(x = NULL, y = NULL, fill = 'Damaged') +
  coord_flip() +
  guides(fill = 'none')

(c + d) 
}

incident_year

Year the incident occurred (1990-2015)

  • No missing data
  • Rate decreasing over time
explore_num(incident_year, binwidth = 1)

incident_month

Month the incident occurred (1-12)

  • Change to factor
  • seasonal impact
explore_num(incident_month, binwidth = 1)

incident_day

Day the incident occurred (1-31)

  • Omit raw
  • Could combine with month and year to get date and use step_date?
  • Public holidays could be an issue step_holiday
  • Day of week might be more useful - Not useful
explore_char(dow)

operator_id / operator

International Civil Aviation Organization (ICAO) code for aircraft operators / Name of the Airline

  • Omit operator_id
  • some operators have higher rates
explore_char(operator)

aircraft

Model of aircraft involved in the strike

For example: B-777 means a Boeing 777 was struck in the incident

explore_char(aircraft)

aircraft_type

  • ‘A’: “multi-engined aeroplanes powered by turbo-propeller engines with an MOPSC of more than nine or a maximum take-off mass exceeding 5 700 kg, and all multi-engined turbo-jet powered aeroplanes”
  • ‘B’: “aeroplanes powered by propeller engines with an MOPSC of nine or less and a maximum take-off mass of 5 700 kg or less”
explore_char(aircraft_type)

aircraft_make

International Civil Aviation Organization (ICAO) code for Aircraft Make

explore_char(aircraft_make)

aircraft_model

International Civil Aviation Organization (ICAO) code for Aircraft Model

  • should be factor
explore_num(aircraft_model, binwidth = 10)

aircraft_mass

  • 1 = 2,250 kg or less

  • 2 = 2,251-5,700 kg

  • 3 = 5,701-27,000 kg

  • 4 = 27,001-272,000 kg

  • 5 = above 272,000 kg

  • convert to factor

explore_char(aircraft_mass)

engine_make

Engine Make Code

  • should be a factor
explore_num(engine_make, binwidth = 1)

engine_model

Engine Model Code

explore_char(engine_model)

engines

Number of engines powering the aircraft

explore_num(engines, binwidth = 1)

engine_type

The way the engine is powered

  • ‘A’ - Reciprocating
  • ‘B’ - Turbojet
  • ‘C’ - Turboprop
  • ‘D’ - Turbofan
  • ‘E’ - None (glider)
  • ‘F’ - Turboshaft (helicopter)
  • Special Note: Some aircrafts can have two engine types
    • Example: ‘B/D’
explore_char(engine_type)

engine1_position

Where engine 1 is mounted on aircraft

  • should be factor
explore_num(engine1_position)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

engine2_position

Where engine 2 is mounted on aircraft

  • should be a factor
explore_num(engine2_position)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

engine3_position

Where engine 3 is mounted on aircraft

  • omit

engine4_position

Where engine 4 is mounted on aircraft

  • omit

airport_id / airport

Location of the strike given as the International Civil Aviation Organization (ICAO) airport identifier / Name of the airport where the strike occurred

explore_char(airport)

state

Abbreviated state from US, Canada

explore_char(state)

faa_region

FAA region where the airport is located

  • The US has 9 regions (AAL, ACE, AEA, AGL, ANE, ANM, ASO, ASW, AWP)
  • Canada has 5 regions (ATL, AWP, ONT, P&N, PAC, QUE)
  • All other regions of the world are considered foreign regions (FGN)
explore_char(faa_region)

flight_phase

Phase of flight during which the strike occured

  • APPROACH
  • ARRIVAL
  • CLIMB
  • DEPARTURE
  • DESCENT
  • EN ROUTE
  • LANDING
  • LANDING ROLL
  • LOCAL
  • PARKED
  • TAKEOFF RUN
  • TAXI
explore_char(flight_phase)

visibility

Time of day when the strike occurred

  • DAWN
  • DAY
  • DUSK
  • NIGHT
  • UNKNOWN
explore_char(visibility)

precipitation

  • FOG -RAIN
  • SNOW
  • NONE
  • Special Note: Can have multiple values
    • Example: FOG, RAIN
  • Is it worth splitting?
explore_char(precipitation)

height

Feet About Ground Level where the strike occurred (0-24,000)

explore_num(height, bins = 50)

Try log

explore_num(log_height, bins = 50)

speed

Velocity in Knots (0-2500)

  • trim high values
explore_num(speed, bins = 50)

Try log

explore_num(log_speed, bins = 50)

distance

How many miles from the airport (0-100)

explore_num(distance, binwidth = 5)

Try log

explore_num(log_distance, bins = 50)

species_id / species_name

International Civil Aviation Organization (ICAO) code for type of wildlife / Common name for type of wildlife struck

  • just use name
explore_char(species_name)

species_quantity

Amount of wildlife struck

  • 1
  • 2-10
  • 11-100
  • Over 100
explore_char(species_quantity)

flight_impact

How the strike impacted the flight

  • ABORTED
  • TAKEOFF
  • ENGINE SHUTDOWN
  • NONE
  • OTHER
  • PRECAUTIONARY LANDING
explore_char(flight_impact)

Split in to train and test

set.seed(728)

to_build <- to_build %>% 
  filter(! is.na(damaged)) %>% 
  mutate(damaged = factor(damaged, levels = c(1, 0)))

split <- initial_split(to_build, strata = damaged)
train <- training(split)
test <- testing(split)

Create folds

set.seed(530)
train_folds <- vfold_cv(train, 5)

Create recipe

rec <- recipe(damaged ~ ., data = train) %>%
  # id role
  update_role(id, new_role = "ID") %>%
  # create variables
  step_mutate(
    speed = if_else(speed > 500, 500, speed)
  ) %>% 
  # numbers to characters
  step_mutate(
    incident_month = as.character(incident_month), 
    aircraft_mass = as.character(aircraft_mass), 
    aircraft_model = as.character(aircraft_model), 
    engine_make = as.character(engine_make), 
    engine1_position = as.character(engine1_position), 
    engine2_position = as.character(engine2_position),
    engines = as.character(engines)
  ) %>% 
  # remove variables
  step_rm(operator_id, incident_day, species_id, airport_id, 
    engine3_position, engine4_position) %>%
  # flag for missing values
  step_indicate_na(height, speed, distance) %>% 
  # imputation of missing values
  step_impute_mean(height, speed, distance) %>% 
  # character to factors
  step_string2factor(all_nominal_predictors()) %>% 
  # explicit na
  step_unknown(all_nominal_predictors()) %>% 
  # handle new classes
  step_novel(all_nominal(), -all_outcomes()) %>% 
  # lump together small values
  step_other(all_nominal_predictors()) %>% 
  # all numeric
  step_dummy(all_nominal_predictors()) %>% 
  # remove if zero variance
  step_zv(all_predictors()) %>% 
  # rebalance
  step_downsample(damaged)

Check recipe outputs

prepped <- prep(rec)
baked <- bake(prepped, new_data = NULL) 

skim(baked)
Data summary
Name baked
Number of rows 2612
Number of columns 89
_______________________
Column type frequency:
factor 1
numeric 88
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
damaged 0 1 FALSE 2 1: 1306, 0: 1306

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 14959.53 8582.28 2 7622.75 15026.00 22121.75 30000 ▇▇▇▇▇
incident_year 0 1 2004.79 7.02 1990 2000.00 2006.00 2011.00 2015 ▃▅▆▆▇
height 0 1 1018.77 1850.34 0 3.00 836.68 836.68 24000 ▇▁▁▁▁
speed 0 1 142.35 34.66 4 140.00 141.74 141.74 350 ▁▇▂▁▁
distance 0 1 0.90 3.76 0 0.00 0.69 0.69 100 ▇▁▁▁▁
na_ind_height 0 1 0.32 0.47 0 0.00 0.00 1.00 1 ▇▁▁▁▃
na_ind_speed 0 1 0.52 0.50 0 0.00 1.00 1.00 1 ▇▁▁▁▇
na_ind_distance 0 1 0.53 0.50 0 0.00 1.00 1.00 1 ▇▁▁▁▇
incident_month_X11 0 1 0.09 0.29 0 0.00 0.00 0.00 1 ▇▁▁▁▁
incident_month_X3 0 1 0.07 0.25 0 0.00 0.00 0.00 1 ▇▁▁▁▁
incident_month_X4 0 1 0.08 0.27 0 0.00 0.00 0.00 1 ▇▁▁▁▁
incident_month_X5 0 1 0.09 0.29 0 0.00 0.00 0.00 1 ▇▁▁▁▁
incident_month_X6 0 1 0.07 0.26 0 0.00 0.00 0.00 1 ▇▁▁▁▁
incident_month_X7 0 1 0.10 0.29 0 0.00 0.00 0.00 1 ▇▁▁▁▁
incident_month_X8 0 1 0.13 0.33 0 0.00 0.00 0.00 1 ▇▁▁▁▁
incident_month_X9 0 1 0.13 0.33 0 0.00 0.00 0.00 1 ▇▁▁▁▁
incident_month_other 0 1 0.13 0.33 0 0.00 0.00 0.00 1 ▇▁▁▁▁
operator_BUSINESS 0 1 0.16 0.36 0 0.00 0.00 0.00 1 ▇▁▁▁▂
operator_SOUTHWEST.AIRLINES 0 1 0.07 0.26 0 0.00 0.00 0.00 1 ▇▁▁▁▁
operator_UNITED.AIRLINES 0 1 0.07 0.26 0 0.00 0.00 0.00 1 ▇▁▁▁▁
operator_UNKNOWN 0 1 0.13 0.33 0 0.00 0.00 0.00 1 ▇▁▁▁▁
operator_other 0 1 0.52 0.50 0 0.00 1.00 1.00 1 ▇▁▁▁▇
aircraft_UNKNOWN 0 1 0.13 0.34 0 0.00 0.00 0.00 1 ▇▁▁▁▁
aircraft_other 0 1 0.81 0.39 0 1.00 1.00 1.00 1 ▂▁▁▁▇
aircraft_type_unknown 0 1 0.14 0.34 0 0.00 0.00 0.00 1 ▇▁▁▁▁
aircraft_type_other 0 1 0.02 0.16 0 0.00 0.00 0.00 1 ▇▁▁▁▁
aircraft_make_X148 0 1 0.29 0.45 0 0.00 0.00 1.00 1 ▇▁▁▁▃
aircraft_make_X188 0 1 0.05 0.22 0 0.00 0.00 0.00 1 ▇▁▁▁▁
aircraft_make_X583 0 1 0.08 0.27 0 0.00 0.00 0.00 1 ▇▁▁▁▁
aircraft_make_unknown 0 1 0.15 0.35 0 0.00 0.00 0.00 1 ▇▁▁▁▂
aircraft_make_other 0 1 0.35 0.48 0 0.00 0.00 1.00 1 ▇▁▁▁▅
aircraft_model_X24 0 1 0.07 0.25 0 0.00 0.00 0.00 1 ▇▁▁▁▁
aircraft_model_unknown 0 1 0.19 0.39 0 0.00 0.00 0.00 1 ▇▁▁▁▂
aircraft_model_other 0 1 0.70 0.46 0 0.00 1.00 1.00 1 ▃▁▁▁▇
aircraft_mass_X4 0 1 0.48 0.50 0 0.00 0.00 1.00 1 ▇▁▁▁▇
aircraft_mass_unknown 0 1 0.17 0.37 0 0.00 0.00 0.00 1 ▇▁▁▁▂
aircraft_mass_other 0 1 0.21 0.41 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engine_make_X22 0 1 0.12 0.33 0 0.00 0.00 0.00 1 ▇▁▁▁▁
engine_make_X31 0 1 0.08 0.28 0 0.00 0.00 0.00 1 ▇▁▁▁▁
engine_make_X34 0 1 0.16 0.36 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engine_make_unknown 0 1 0.18 0.39 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engine_make_other 0 1 0.28 0.45 0 0.00 0.00 1.00 1 ▇▁▁▁▃
engine_model_X10 0 1 0.17 0.38 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engine_model_X4 0 1 0.12 0.32 0 0.00 0.00 0.00 1 ▇▁▁▁▁
engine_model_unknown 0 1 0.20 0.40 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engine_model_other 0 1 0.24 0.43 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engines_unknown 0 1 0.17 0.37 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engines_other 0 1 0.17 0.37 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engine_type_D 0 1 0.60 0.49 0 0.00 1.00 1.00 1 ▅▁▁▁▇
engine_type_unknown 0 1 0.17 0.37 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engine_type_other 0 1 0.14 0.34 0 0.00 0.00 0.00 1 ▇▁▁▁▁
engine1_position_X4 0 1 0.11 0.31 0 0.00 0.00 0.00 1 ▇▁▁▁▁
engine1_position_X5 0 1 0.24 0.43 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engine1_position_unknown 0 1 0.18 0.38 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engine1_position_other 0 1 0.11 0.31 0 0.00 0.00 0.00 1 ▇▁▁▁▁
engine2_position_X4 0 1 0.11 0.31 0 0.00 0.00 0.00 1 ▇▁▁▁▁
engine2_position_X5 0 1 0.21 0.41 0 0.00 0.00 0.00 1 ▇▁▁▁▂
engine2_position_unknown 0 1 0.28 0.45 0 0.00 0.00 1.00 1 ▇▁▁▁▃
engine2_position_other 0 1 0.05 0.22 0 0.00 0.00 0.00 1 ▇▁▁▁▁
airport_other 0 1 0.85 0.36 0 1.00 1.00 1.00 1 ▂▁▁▁▇
state_FL 0 1 0.06 0.24 0 0.00 0.00 0.00 1 ▇▁▁▁▁
state_NY 0 1 0.04 0.20 0 0.00 0.00 0.00 1 ▇▁▁▁▁
state_TX 0 1 0.08 0.27 0 0.00 0.00 0.00 1 ▇▁▁▁▁
state_unknown 0 1 0.17 0.38 0 0.00 0.00 0.00 1 ▇▁▁▁▂
state_other 0 1 0.56 0.50 0 0.00 1.00 1.00 1 ▆▁▁▁▇
faa_region_AGL 0 1 0.13 0.34 0 0.00 0.00 0.00 1 ▇▁▁▁▁
faa_region_ANM 0 1 0.08 0.26 0 0.00 0.00 0.00 1 ▇▁▁▁▁
faa_region_ASO 0 1 0.16 0.37 0 0.00 0.00 0.00 1 ▇▁▁▁▂
faa_region_ASW 0 1 0.11 0.32 0 0.00 0.00 0.00 1 ▇▁▁▁▁
faa_region_AWP 0 1 0.12 0.33 0 0.00 0.00 0.00 1 ▇▁▁▁▁
faa_region_unknown 0 1 0.15 0.36 0 0.00 0.00 0.00 1 ▇▁▁▁▂
faa_region_other 0 1 0.10 0.30 0 0.00 0.00 0.00 1 ▇▁▁▁▁
flight_phase_CLIMB 0 1 0.16 0.36 0 0.00 0.00 0.00 1 ▇▁▁▁▂
flight_phase_LANDING.ROLL 0 1 0.11 0.32 0 0.00 0.00 0.00 1 ▇▁▁▁▁
flight_phase_TAKEOFF.RUN 0 1 0.13 0.34 0 0.00 0.00 0.00 1 ▇▁▁▁▁
flight_phase_unknown 0 1 0.23 0.42 0 0.00 0.00 0.00 1 ▇▁▁▁▂
flight_phase_other 0 1 0.09 0.29 0 0.00 0.00 0.00 1 ▇▁▁▁▁
visibility_NIGHT 0 1 0.22 0.42 0 0.00 0.00 0.00 1 ▇▁▁▁▂
visibility_unknown 0 1 0.28 0.45 0 0.00 0.00 1.00 1 ▇▁▁▁▃
visibility_other 0 1 0.05 0.22 0 0.00 0.00 0.00 1 ▇▁▁▁▁
precipitation_unknown 0 1 0.42 0.49 0 0.00 0.00 1.00 1 ▇▁▁▁▆
precipitation_other 0 1 0.06 0.23 0 0.00 0.00 0.00 1 ▇▁▁▁▁
species_name_UNKNOWN.SMALL.BIRD 0 1 0.12 0.33 0 0.00 0.00 0.00 1 ▇▁▁▁▁
species_name_other 0 1 0.61 0.49 0 0.00 1.00 1.00 1 ▅▁▁▁▇
species_quantity_X2.10 0 1 0.16 0.37 0 0.00 0.00 0.00 1 ▇▁▁▁▂
species_quantity_other 0 1 0.03 0.18 0 0.00 0.00 0.00 1 ▇▁▁▁▁
flight_impact_unknown 0 1 0.33 0.47 0 0.00 0.00 1.00 1 ▇▁▁▁▃
flight_impact_other 0 1 0.18 0.39 0 0.00 0.00 0.00 1 ▇▁▁▁▂


Specify model

xgboost_spec <- boost_tree(
    trees = 2000, 
    min_n = 23, 
    tree_depth = 15, 
    learn_rate = 0.00591673615298013, 
    loss_reduction = 3.41525742335076e-07, 
    sample_size = 0.648717504832894
  ) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") %>%
  set_args(importance = "impurity")

Specify workflow

xgboost_workflow <- workflow() %>% 
  add_recipe(rec) %>% 
  add_model(xgboost_spec) 

xgboost_workflow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
12 Recipe Steps

• step_mutate()
• step_mutate()
• step_rm()
• step_indicate_na()
• step_impute_mean()
• step_string2factor()
• step_unknown()
• step_novel()
• step_other()
• step_dummy()
• ...
• and 2 more steps.

── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (classification)

Main Arguments:
  trees = 2000
  min_n = 23
  tree_depth = 15
  learn_rate = 0.00591673615298013
  loss_reduction = 3.41525742335076e-07
  sample_size = 0.648717504832894

Engine-Specific Arguments:
  importance = impurity

Computational engine: xgboost 

Tune

Tuning happened but won’t be included because it took so long.

set.seed(34379)
xgboost_tune <- tune_grid(
  xgboost_workflow, 
  resamples = train_folds, 
  grid = 9
 )

Tuning outcome

The results were:

  • trees = 2000
  • min_n = 23
  • tree_depth = 15
  • learn_rate = 0.00591673615298013
  • loss_reduction = 3.41525742335076e-07
  • sample_size = 0.648717504832894
show_best(xgboost_tune, metric = "roc_auc") %>%
  gt()

autoplot(xgboost_tune) +
  theme_bw()

Finalise model

final_wkflow <- xgboost_workflow #%>%
  #finalize_workflow(select_best(xgboost_tune, metric = "roc_auc"))

final_wkflow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
12 Recipe Steps

• step_mutate()
• step_mutate()
• step_rm()
• step_indicate_na()
• step_impute_mean()
• step_string2factor()
• step_unknown()
• step_novel()
• step_other()
• step_dummy()
• ...
• and 2 more steps.

── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (classification)

Main Arguments:
  trees = 2000
  min_n = 23
  tree_depth = 15
  learn_rate = 0.00591673615298013
  loss_reduction = 3.41525742335076e-07
  sample_size = 0.648717504832894

Engine-Specific Arguments:
  importance = impurity

Computational engine: xgboost 

Fit model

xgboost_fit <- last_fit(final_wkflow, split)

Check performance

collect_metrics(xgboost_fit) %>%
  gt()
.metric .estimator .estimate .config
accuracy binary 0.7558095 Preprocessor1_Model1
roc_auc binary 0.8740577 Preprocessor1_Model1

Get predictions

xgboost_preds <- collect_predictions(xgboost_fit)

ROC Curve

xgboost_preds %>%
  roc_curve(damaged, .pred_1) %>% 
  autoplot()

Apply to new data

have_scored <- cbind(
    to_score %>% select(id), 
    predict(xgboost_fit$.workflow[[1]], to_score, type = 'prob')
  ) %>%
  select(id, damaged = .pred_1)

Output submission

write_csv(have_scored, 's01e02/submission.csv')

Outcome

Score of 0.42

Learnings

  • Check whether the submission is for class or probability and use type = 'prob' in the predict() statement for the latter. 🥥
  • Used several new steps

More to learn

  • Get the variable name into the chart titles when using a function
  • Look at variable importance
banner